# Import all libraries needed for the tutorial
# General syntax to import specific functions in a library:
##from (library) import (specific library function)
from pandas import DataFrame, read_csv
# General syntax to import a library but no functions:
##import (library) as (give the library a nickname/alias)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number
import matplotlib.pyplot as plt
import seaborn
from sklearn import preprocessing # to normalise existing X
from nltk.corpus import stopwords
import folium as folium
from sklearn.preprocessing import label_binarize
from collections import Counter
from sklearn.metrics import classification_report
from itertools import cycle
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 300
from scipy import interp
import pylab as pl
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from os import listdir
from os.path import isfile, join
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import sklearn.metrics.pairwise
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import roc_auc_score
from wordcloud import WordCloud
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import warnings
# Enable inline plotting
%matplotlib inline
# Folders of the article corpus: one directory per category.
business_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/business"
entertainment_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/entertainment"
politics_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/politics"
sport_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/sport"
tech_path="/home/prokopis/Desktop/Data Mining Εργασία 2/fulltext/data/tech"
# Running counter used as the article Id while rows are appended later.
unique_num=0
col_names = ['Id', 'Title', 'Content','Category']
# Master frame: one row per article, filled by the loading loops that follow.
Data=pd.DataFrame(columns = col_names)
# List only the plain files inside each category folder (sub-directories excluded).
businessfiles = [f for f in listdir(business_path) if isfile(join(business_path, f))]
entertainmentfiles = [f for f in listdir(entertainment_path) if isfile(join(entertainment_path, f))]
politicsfiles = [f for f in listdir(politics_path) if isfile(join(politics_path, f))]
sportfiles = [f for f in listdir(sport_path) if isfile(join(sport_path, f))]
techfiles=[f for f in listdir(tech_path) if isfile(join(tech_path, f))]
def _split_article(lines):
    """Split one article's lines into (title, content).

    The first line is the title; the remaining lines are concatenated with a
    leading space each (matching the original " line1 line2 ..." layout).
    Empty files yield ("", "") instead of raising IndexError.
    """
    title = lines[0] if lines else ""
    content = "".join(" " + line for line in lines[1:])
    return title, content

# Load every article of every category into Data as [Id, Title, Content, Category].
# The original repeated this loop five times; it is now a single loop.
# CONSISTENCY FIX: only the sport loop opened files with
# encoding="utf8", errors='ignore'; the same handling now applies everywhere,
# so a stray non-UTF8 byte in another category no longer crashes the load.
for Category, folder, filenames in [
        ('business', business_path, businessfiles),
        ('entertainment', entertainment_path, entertainmentfiles),
        ('politics', politics_path, politicsfiles),
        ('sport', sport_path, sportfiles),
        ('tech', tech_path, techfiles)]:
    for fname in filenames:
        with open(join(folder, fname), encoding="utf8", errors='ignore') as f:
            title, Content = _split_article(f.read().splitlines())
        # Append one row at the next free integer position.
        Data.loc[Data.shape[0]] = [unique_num, title, Content, Category]
        unique_num = unique_num + 1
# Stratified 80/20 split so every category keeps its proportion in both sets.
train, test = train_test_split(Data, test_size=0.2, stratify=Data[['Category']])
train.to_csv('train_set.tsv', sep='\t')
# Keep the ground-truth labels aside, then remove them from the test set.
y_Correct = test['Category']
# BUG FIX: test.drop('Category', 1) used the positional `axis` argument,
# which was deprecated and removed in pandas 2.0.
test = test.drop(columns='Category')
test.to_csv('test_set.tsv', sep='\t')
#1
# One word cloud per category, built from the concatenated article bodies.
# The original duplicated this five times and built each string with
# quadratic `+=` concatenation; `" ".join` does the same at C speed.
for _category in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    _subset = Data[Data["Category"] == _category]
    _text = " ".join(_subset['Content'])
    _wc = WordCloud(width=3000, height=3000, background_color="white").generate(_text)
    plt.figure(figsize=(100, 100))
    plt.imshow(_wc)
    plt.axis("off")
    plt.show()
#2
# Text/label columns used to train and evaluate the classifiers.
x_train=train['Content']
y_train=train['Category']
x_test=test['Content']
###Input1
# Bag-of-words representation: fit the vocabulary on the training texts only,
# then apply the same vocabulary to the test texts.
count_train = CountVectorizer()
count1= count_train.fit_transform(x_train)
count2=count_train.transform(x_test)
###Input2
# Tf-idf representation with the same fit-on-train / transform-test discipline.
tf_train=TfidfVectorizer()
x = tf_train.fit_transform(x_train)
y=tf_train.transform(x_test)
# Silence the sklearn deprecation/convergence warnings of the runs below.
warnings.filterwarnings('ignore')
####Support Vector Machine -Input1
# Pick one random hyper-parameter combination for the SVM.
# BUG FIX: the indices were drawn with randint(0,3)/randint(0,1), so the last
# C/gamma values and the 'linear' kernel could never be selected.
C = [0.1, 1, 10, 100, 1000]
num1 = np.random.randint(0, len(C))
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
num2 = np.random.randint(0, len(gamma))
kernel = ['rbf', 'linear']
num3 = np.random.randint(0, len(kernel))
kernel_input = kernel[num3]
gamma_input = gamma[num2]
C_input = C[num1]
clf = svm.SVC(kernel=kernel_input, gamma=gamma_input, C=C_input, probability=True)
clf.fit(count1, y_train)
y_pred1 = clf.predict(count2)
# 10-fold cross-validated metrics on the training data.
result1 = cross_val_score(clf, count1, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(clf, count1, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(clf, count1, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(clf, count1, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = clf.predict_proba(count2)  # clf is already fitted; the original refit needlessly
n_classes = 5
# Binarize the true labels; column order follows y_Correct.unique().
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
####Support Vector Machine -Input2
# Same random hyper-parameter pick as Input1.
# BUG FIX: randint(0,3)/randint(0,1) could never select the last C/gamma
# values or the 'linear' kernel.
C = [0.1, 1, 10, 100, 1000]
num1 = np.random.randint(0, len(C))
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
num2 = np.random.randint(0, len(gamma))
kernel = ['rbf', 'linear']
num3 = np.random.randint(0, len(kernel))
kernel_input = kernel[num3]
gamma_input = gamma[num2]
C_input = C[num1]
clf = svm.SVC(kernel=kernel_input, gamma=gamma_input, C=C_input, probability=True)
clf.fit(x, y_train)
y_pred2 = clf.predict(y)
# 10-fold cross-validated metrics on the tf-idf training data.
result1 = cross_val_score(clf, x, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(clf, x, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(clf, x, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(clf, x, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = clf.predict_proba(y)  # clf is already fitted; no need to refit
n_classes = 5
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
###Random Forests -Input1
clf = RandomForestClassifier()
clf.fit(count1, y_train)
y_pred1 = clf.predict(count2)
# 10-fold cross-validated metrics on the count-vector training data.
result1 = cross_val_score(clf, count1, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(clf, count1, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(clf, count1, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(clf, count1, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = clf.predict_proba(count2)  # clf is already fitted; the original refit needlessly
n_classes = 5
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
###Random Forests -Input2
clf = RandomForestClassifier()
clf.fit(x, y_train)
y_pred2 = clf.predict(y)
# 10-fold cross-validated metrics on the tf-idf training data.
result1 = cross_val_score(clf, x, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(clf, x, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(clf, x, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(clf, x, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = clf.predict_proba(y)  # clf is already fitted; no need to refit
n_classes = 5
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
##Naive Bayes-Input1
gnb = GaussianNB()
# GaussianNB needs dense input; densify the sparse count matrices.
count1 = count1.toarray()
gnb.fit(count1, y_train)
count2 = count2.toarray()
y_pred1 = gnb.predict(count2)
# BUG FIX: the CV metrics below scored `clf` — the RandomForest left over from
# the previous section — instead of the Naive Bayes model being evaluated.
result1 = cross_val_score(gnb, count1, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(gnb, count1, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(gnb, count1, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(gnb, count1, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = gnb.predict_proba(count2)  # gnb is already fitted; no need to refit
n_classes = 5
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
##Naive Bayes-Input2
gnb = GaussianNB()
# GaussianNB needs dense input; densify the sparse tf-idf matrices.
x = x.toarray()
gnb.fit(x, y_train)
y = y.toarray()
y_pred2 = gnb.predict(y)
# BUG FIX: the CV metrics below scored `clf` — the RandomForest left over from
# the previous section — instead of the Naive Bayes model being evaluated.
result1 = cross_val_score(gnb, x, y_train, cv=10, scoring='accuracy')
print("Accuracy: %.2f%%" % (result1.mean()*100.0))
result2 = cross_val_score(gnb, x, y_train, cv=10, scoring='precision_macro')
result3 = cross_val_score(gnb, x, y_train, cv=10, scoring='recall_macro')
result4 = cross_val_score(gnb, x, y_train, cv=10, scoring='f1_weighted')
print("recall: %.2f%%" % (result3.mean()*100.0))
print("Precision: %.2f%%" % (result2.mean()*100.0))
print("F-Measure:%.2f%%" % (result4.mean()*100.0))
# One-vs-rest ROC analysis on the held-out test set.
y_score = gnb.predict_proba(y)  # gnb is already fitted; no need to refit
n_classes = 5
actual = label_binarize(y_Correct.to_numpy(), classes=y_Correct.unique())
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(actual[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fpr["micro"], tpr["micro"], _ = roc_curve(actual.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro average: interpolate every class curve onto a common FPR grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    # np.interp replaces scipy.interp, which was removed from SciPy.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
print("\tAUC(macro): ", roc_auc["macro"])
ret_roc_auc = roc_auc["macro"]
# Plot all ROC curves.
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
colors = cycle(['green', 'darkorange', 'cornflowerblue', 'red', 'blue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
##K-Nearest Neighbor-Input1
# Number of test documents to classify.
val1=x_test.shape
val=val1[0]
# Number of neighbours used in the majority vote.
k=3
# Re-vectorize train/test with the CountVectorizer already fitted above.
inputtrain=count_train.transform(np.array(x_train))
inputtest=count_train.transform(np.array(x_test))
def predict(X_train, y_train, X_test, k, g):
    """Classify one test sample by majority vote of its k nearest
    training samples under cosine distance.

    X_train: vectorized training matrix (row-indexable); y_train: labels,
    positionally aligned with X_train's rows; X_test: a single vectorized
    sample; k: neighbours in the vote; g: number of training rows to scan.
    Returns the most common label among the k closest rows.
    """
    dist_index_pairs = []
    for idx in range(g):
        d = sklearn.metrics.pairwise.cosine_distances(X_test, X_train[idx])
        dist_index_pairs.append([d[0][0], idx])
    # Ascending by distance; ties broken by the row index in the pair.
    dist_index_pairs.sort()
    neighbour_labels = [y_train[pair[1]] for pair in dist_index_pairs[:k]]
    return Counter(neighbour_labels).most_common(1)[0][0]
# Evaluate the k-NN classifier on the count-vector representation.
# BUG FIX: the original ran the identical deterministic prediction loop 10
# times ("10-fold") and averaged ten equal values — one pass prints the same
# numbers in a tenth of the time. It also called .mean() on scalar metrics,
# which crashes when the metric is a plain float.
predictions = []
g = x_train.shape[0]          # hoisted: invariant across test documents
y_train_arr = np.array(y_train)
for i in range(val):
    predictions.append(predict(inputtrain, y_train_arr, inputtest[i], k, g))
y_score1 = np.asarray(predictions)
total_Accuracy = accuracy_score(y_Correct, y_score1)
total_precision = precision_score(y_Correct, y_score1, average='macro')
total_recall = recall_score(y_Correct, y_score1, average='macro')
total_f1 = f1_score(y_Correct, y_score1, average='macro')
print("Accuracy: %.2f%%" % (total_Accuracy*100.0))
print("Precision: %.2f%%" % (total_precision*100.0))
print("Recall: %.2f%%" % (total_recall*100.0))
print("F-Measure: %.2f%%" % (total_f1*100.0))
##K-Nearest Neighbor-Input2
# Number of test documents to classify.
val1=x_test.shape
val=val1[0]
# Number of neighbours used in the majority vote.
k=3
# NOTE(review): this rebinds x/y (densified earlier for GaussianNB) back to
# sparse tf-idf matrices produced by the already-fitted TfidfVectorizer.
x = tf_train.transform(np.array(x_train))
y=tf_train.transform(np.array(x_test))
def predict(X_train, y_train, X_test, k, g):
    """k-nearest-neighbour vote for a single vectorized sample.

    Scans the first g rows of X_train, ranks them by cosine distance to
    X_test, and returns the majority label among the k closest rows
    (labels taken positionally from y_train).
    """
    scored = []
    for row in range(g):
        gap = sklearn.metrics.pairwise.cosine_distances(X_test, X_train[row])
        scored.append([gap[0][0], row])
    # Nearest first; equal distances fall back to row order.
    scored.sort()
    votes = [y_train[entry[1]] for entry in scored[:k]]
    return Counter(votes).most_common(1)[0][0]
# Evaluate the k-NN classifier on the tf-idf representation.
# BUG FIX: as with Input1, the original repeated an identical deterministic
# loop 10 times and averaged ten equal values; a single pass prints the same
# numbers. The scalar-metric .mean() calls were dropped as well.
predictions1 = []
g = x_train.shape[0]          # hoisted: invariant across test documents
y_train_arr = np.array(y_train)
for i in range(val):
    predictions1.append(predict(x, y_train_arr, y[i], k, g))
y_score2 = np.asarray(predictions1)
total_Accuracy = accuracy_score(y_Correct, y_score2)
total_precision = precision_score(y_Correct, y_score2, average='macro')
total_recall = recall_score(y_Correct, y_score2, average='macro')
total_f1 = f1_score(y_Correct, y_score2, average='macro')
print("Accuracy: %.2f%%" % (total_Accuracy*100.0))
print("Precision: %.2f%%" % (total_precision*100.0))
print("Recall: %.2f%%" % (total_recall*100.0))
print("F-Measure: %.2f%%" % (total_f1*100.0))
#4
# Three vector representations of the test documents, each normalised and
# clustered with KMeans (k=5, one cluster per hoped-for category).
#input1: raw term counts
count1 = CountVectorizer()
in1 = count1.fit_transform(test['Content'])
#input2: tf-idf weights
tfidf = TfidfVectorizer()
in2 = tfidf.fit_transform(test['Content'])
#input3: word2vec embeddings
# BUG FIX: Word2Vec was fed raw strings, which gensim iterates character by
# character; the documents are tokenised into word lists first.
tokenised_docs = [doc.split() for doc in test['Content']]
model = Word2Vec(tokenised_docs, min_count=1)
# gensim 4 API: model[...] and model.wv.vocab were removed; index_to_key
# lists the vocabulary in frequency order.
in3 = model.wv[model.wv.index_to_key]
##input1
In1_Norm = preprocessing.normalize(in1)
km2 = KMeans(n_clusters=5, init='k-means++', random_state=42).fit(In1_Norm)
predicted_value1 = km2.predict(In1_Norm)
##input2
In2_Norm = preprocessing.normalize(in2)
km2 = KMeans(n_clusters=5, init='k-means++', random_state=42).fit(In2_Norm)
predicted_value2 = km2.predict(In2_Norm)
##input3
In3_Norm = preprocessing.normalize(in3)
km2 = KMeans(n_clusters=5, init='k-means++', random_state=42).fit(In3_Norm)
predicted_value3 = km2.predict(In3_Norm)
#Input1
# Cluster the normalised count vectors and attach the labels to the frame.
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In1_Norm)
test['Category']=km.labels_
test['Segment']=test['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=test['Category']
seaborn.set(rc={'figure.figsize':(8,5)})
# BUG FIX: seaborn >= 0.12 requires x/y to be passed as keyword arguments;
# the old positional form raises a TypeError.
seaborn.scatterplot(x=x_axis, y=y_Axis, hue=test['Segment'], palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input1.png',dpi=1000)
#Input2
# Cluster the normalised tf-idf vectors and attach the labels to the frame.
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In2_Norm)
test['Category']=km.labels_
test['Segment']=test['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=test['Category']
fig=plt.figure(figsize=(6,3))
fig.set_size_inches(5, 8)
# BUG FIX: seaborn >= 0.12 requires x/y to be passed as keyword arguments.
seaborn.scatterplot(x=x_axis, y=y_Axis, hue=test['Segment'], palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input2.png',dpi=1000)
#Input3
# Cluster the normalised word2vec vectors.
km = KMeans(n_clusters=5,init='k-means++',random_state=42).fit(In3_Norm)
e=pd.DataFrame(columns=['Category'])
e['Category']=km.labels_
# NOTE(review): km was fitted on word vectors, so e has one row per vocabulary
# entry, not per test document; the index-aligned map below can therefore
# yield NaN for most rows (the author's trailing notes mention this error).
test['Segment']=e['Category'].map({0:'business',1:'entertainment',2:'politics',3:'sport',4:'tech'})
x_axis=test['Content']
y_Axis=e['Category']
seaborn.set(rc={'figure.figsize':(8,5)})
# BUG FIX: seaborn >= 0.12 requires x/y to be passed as keyword arguments.
seaborn.scatterplot(x=x_axis, y=y_Axis, hue=test['Segment'], palette=['g','r','c','m','b'])
plt.title('CLusters by PCA Compontents')
plt.savefig('Principal Components Analysis-Input3.png',dpi=1000)
###
#ΟΝΟΜΑΤΕΠΩΝΥΜΟ:ΠΡΟΚΟΠΙΟΣ ΣΤΑΜΕΛΙΑΣ
#ΑΡΙΘΜΟΣ ΜΗΤΡΩΟΥ:1115201400190
#Δεν έχει υλοποιηθεί το roc plot για το knn classification
#Υπάρχει ένα error στη 2d εμφάνιση της συμπίεσης: δεν κατάφερα να επιλύσω το πρόβλημα με το size των εικόνων, παρόλο που οι εικόνες εμφανίζονται!
#Τα αποτελέσματα στη μέθοδο συμπίεσης δεν εμφανίζονται στην οθόνη αλλά σε .png εικόνες! Είναι μέσα στον φάκελο της εργασίας που απέστειλα και οι τρεις εικόνες, αντίστοιχα, για τις αναπαραστάσεις των κειμένων!
#1-CountVectorizer
#2-Tfidf
#3-Word2vec
#Επίσης στο input2 βγάζει το ίδιο error με το input1, αλλά επειδή το έτρεξα πάλι τελευταία στιγμή για να προλάβω να το στείλω, δεν ολοκληρώθηκε!
#
#
##